# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Machine learning related models
from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer, FunctionTransformer, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
# Statistics
from scipy.stats import ttest_ind, f_oneway
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
class EDA_basic:
    """Basic exploratory-data-analysis helpers around a single pandas DataFrame.

    The frame is either passed in directly (``df``) or read from a CSV file
    (``link``). Column names are normalised in place to lower case with spaces
    replaced by underscores; when ``df`` is supplied, the caller's object is
    the one that gets renamed because no copy is taken.

    Attributes:
        df: The DataFrame being analysed.
        category_features: Column names that must be treated as categorical
            regardless of their dtype (empty list when not supplied).
    """

    def __init__(self, df=None, link=None, category_features=None):
        """Store the data and normalise its column names.

        Args:
            df: An existing DataFrame. Takes precedence over ``link``.
            link: Path or URL handed to ``pd.read_csv`` when ``df`` is None.
            category_features: Optional column names to force into the
                categorical group returned by ``features_dtype``.

        Raises:
            ValueError: If neither ``df`` nor ``link`` is provided.
        """
        if df is not None:
            self.df = df
        elif link is not None:
            self.df = pd.read_csv(link)
        else:
            raise ValueError("Either 'df' or 'link' must be provided.")
        self.category_features = category_features if category_features is not None else []
        self.df.columns = self.df.columns.str.lower().str.replace(' ', '_')

    @staticmethod
    def _normalise_name(name):
        """Apply the same lower-case/underscore rule used for the column names."""
        return str(name).lower().replace(' ', '_')

    def basic_statistical_summary(self):
        """Print the first 10 rows, dtype info, missing-value flags and describe()."""
        print(self.df.head(10))  # The first 10 rows
        # info() writes to stdout itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None").
        self.df.info()
        print(self.df.isna().any())  # True for every column that has missing values
        print(self.df.describe())

    def features_dtype(self):
        """Split the columns into categorical and numerical feature names.

        A column is categorical when it is listed in ``self.category_features``
        (names are matched after the same normalisation applied to the column
        names) or when its dtype is object, string or pandas ``category``.
        Every other column, including boolean ones, is numerical.

        Returns:
            tuple[list, list]: ``(categorical_features, numerical_features)``,
            both in column order.
        """
        forced = {self._normalise_name(name) for name in self.category_features}
        categorical_features = []
        numerical_features = []
        # Iterating over dtypes avoids one column lookup per feature.
        for feature, dtype in self.df.dtypes.items():
            is_categorical = (
                feature in forced
                or pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
                or isinstance(dtype, pd.CategoricalDtype)
            )
            if is_categorical:
                categorical_features.append(feature)
            else:
                numerical_features.append(feature)
        return categorical_features, numerical_features

    def skewness_(self):
        """Print the skewness of every numerical feature, largest first.

        Positive values indicate a right-skewed distribution, negative values
        a left-skewed one.
        """
        _, numerical_features = self.features_dtype()
        print('The degree of skewness for each numerical feature: \n')
        result = {feature: self.df[feature].skew() for feature in numerical_features}
        result = dict(sorted(result.items(), key=lambda item: item[1], reverse=True))
        for feature, skewness in result.items():
            print(f'{feature}: {skewness:.2f}')

    def histogram_plot_numerical(self, cols=3, bins=30):
        """Plot a histogram with a KDE curve for every numerical feature.

        The plots help to spot outliers and to judge whether a feature needs
        to be transformed.

        Args:
            cols: Number of subplot columns in the grid.
            bins: Number of histogram bins per feature.

        Raises:
            ValueError: If ``cols`` is smaller than 1.
        """
        if cols < 1:
            raise ValueError("'cols' must be at least 1.")
        _, numerical_features = self.features_dtype()
        n = len(numerical_features)
        if n == 0:
            # A grid with zero rows cannot be created; there is nothing to draw.
            print('No numerical features to plot.')
            return
        rows = (n + cols - 1) // cols  # Ceiling division: enough rows for all features
        # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
        fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(15, 5 * rows), squeeze=False)
        axes = axes.flatten()
        for ax, feature in zip(axes, numerical_features):
            sns.histplot(self.df[feature], bins=bins, alpha=0.7, ax=ax, label=feature, kde=True)
            ax.set_xlabel(feature)
            ax.set_ylabel('Frequency')
            ax.legend()
        # Remove the unused axes at the end of the grid.
        for unused in axes[n:]:
            fig.delaxes(unused)
        plt.tight_layout()
        plt.show()

    def target_imbalance_plot_check(self, target):
        """Draw a bar chart with the relative frequency of each target value.

        Args:
            target: Name of the target column in ``self.df``.
        """
        # Normalised counts make an imbalance between the classes visible.
        ax = self.df[target].value_counts(normalize=True).plot(kind='bar')
        # Annotate every bar with its percentage for a more informative plot.
        for p in ax.patches:
            ax.annotate(f'{p.get_height():.2%}', (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='center', xytext=(0, 5), textcoords='offset points')
        plt.xlabel(target)
        plt.ylabel(f'{target} rate')
        plt.show()

    def outliers_detection(self, target='churn', contamination=0.05, random_state=42):
        """Flag multivariate outliers with an Isolation Forest.

        Categorical features are label-encoded on a working copy, so the
        original values in ``self.df`` are left intact. The prediction is
        stored in ``self.df['outliers']`` (-1 for outliers, 1 for inliers);
        an ``outliers`` column left by an earlier call is overwritten and is
        never used as a model input.

        Args:
            target: Column excluded from the model inputs, or None to keep
                every column.
            contamination: Expected share of outliers, passed to
                ``IsolationForest``.
            random_state: Seed passed to ``IsolationForest``.

        Returns:
            pandas.DataFrame: The rows of ``self.df`` flagged as outliers.

        Raises:
            KeyError: If ``target`` is not a column of ``self.df``.
        """
        categorical_features, _ = self.features_dtype()
        # drop() returns a new frame, so the encoding below cannot leak into self.df.
        features = self.df.drop(columns=['outliers'], errors='ignore')
        if target is not None:
            features = features.drop(columns=[target])
        for feature in categorical_features:
            if feature in features.columns:
                features[feature] = LabelEncoder().fit_transform(features[feature])
        iso = IsolationForest(contamination=contamination, random_state=random_state)
        self.df['outliers'] = iso.fit(features).predict(features)
        outliers = self.df[self.df['outliers'] == -1]
        # len() is safe even when no point is flagged, unlike value_counts()[-1].
        print(f'There are {len(outliers)} number of outliers')
        print(outliers)
        return outliers
# Load the churn data set and run the exploratory checks in order.
# NOTE: the path below is an absolute, machine-specific location; adjust it for your environment.
df = pd.read_csv(r"C:\Users\Admin\Downloads\churn-bigml-80.csv")
# `eda` keeps a reference to this same `df` object (no copy is made), so changes the
# class makes to its frame, such as the column renaming, are visible through `df`.
eda = EDA_basic(df=df)
eda.basic_statistical_summary()
categorical_features, numerical_features = eda.features_dtype()
eda.histogram_plot_numerical()
eda.skewness_()
eda.target_imbalance_plot_check('churn')
outliers = eda.outliers_detection()
state account_length area_code international_plan voice_mail_plan \
0 KS 128 415 No Yes
1 OH 107 415 No Yes
2 NJ 137 415 No No
3 OH 84 408 Yes No
4 OK 75 415 Yes No
5 AL 118 510 Yes No
6 MA 121 510 No Yes
7 MO 147 415 Yes No
8 WV 141 415 Yes Yes
9 RI 74 415 No No
number_vmail_messages total_day_minutes total_day_calls \
0 25 265.1 110
1 26 161.6 123
2 0 243.4 114
3 0 299.4 71
4 0 166.7 113
5 0 223.4 98
6 24 218.2 88
7 0 157.0 79
8 37 258.6 84
9 0 187.7 127
total_day_charge total_eve_minutes total_eve_calls total_eve_charge \
0 45.07 197.4 99 16.78
1 27.47 195.5 103 16.62
2 41.38 121.2 110 10.30
3 50.90 61.9 88 5.26
4 28.34 148.3 122 12.61
5 37.98 220.6 101 18.75
6 37.09 348.5 108 29.62
7 26.69 103.1 94 8.76
8 43.96 222.0 111 18.87
9 31.91 163.4 148 13.89
total_night_minutes total_night_calls total_night_charge \
0 244.7 91 11.01
1 254.4 103 11.45
2 162.6 104 7.32
3 196.9 89 8.86
4 186.9 121 8.41
5 203.9 118 9.18
6 212.6 118 9.57
7 211.8 96 9.53
8 326.4 97 14.69
9 196.0 94 8.82
total_intl_minutes total_intl_calls total_intl_charge \
0 10.0 3 2.70
1 13.7 3 3.70
2 12.2 5 3.29
3 6.6 7 1.78
4 10.1 3 2.73
5 6.3 6 1.70
6 7.5 7 2.03
7 7.1 6 1.92
8 11.2 5 3.02
9 9.1 5 2.46
customer_service_calls churn
0 1 False
1 1 False
2 0 False
3 2 False
4 3 False
5 0 False
6 3 False
7 0 False
8 0 False
9 0 False
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state 2666 non-null object
1 account_length 2666 non-null int64
2 area_code 2666 non-null int64
3 international_plan 2666 non-null object
4 voice_mail_plan 2666 non-null object
5 number_vmail_messages 2666 non-null int64
6 total_day_minutes 2666 non-null float64
7 total_day_calls 2666 non-null int64
8 total_day_charge 2666 non-null float64
9 total_eve_minutes 2666 non-null float64
10 total_eve_calls 2666 non-null int64
11 total_eve_charge 2666 non-null float64
12 total_night_minutes 2666 non-null float64
13 total_night_calls 2666 non-null int64
14 total_night_charge 2666 non-null float64
15 total_intl_minutes 2666 non-null float64
16 total_intl_calls 2666 non-null int64
17 total_intl_charge 2666 non-null float64
18 customer_service_calls 2666 non-null int64
19 churn 2666 non-null bool
dtypes: bool(1), float64(8), int64(8), object(3)
memory usage: 398.5+ KB
None
state False
account_length False
area_code False
international_plan False
voice_mail_plan False
number_vmail_messages False
total_day_minutes False
total_day_calls False
total_day_charge False
total_eve_minutes False
total_eve_calls False
total_eve_charge False
total_night_minutes False
total_night_calls False
total_night_charge False
total_intl_minutes False
total_intl_calls False
total_intl_charge False
customer_service_calls False
churn False
dtype: bool
account_length area_code number_vmail_messages total_day_minutes \
count 2666.000000 2666.000000 2666.000000 2666.00000
mean 100.620405 437.438860 8.021755 179.48162
std 39.563974 42.521018 13.612277 54.21035
min 1.000000 408.000000 0.000000 0.00000
25% 73.000000 408.000000 0.000000 143.40000
50% 100.000000 415.000000 0.000000 179.95000
75% 127.000000 510.000000 19.000000 215.90000
max 243.000000 510.000000 50.000000 350.80000
total_day_calls total_day_charge total_eve_minutes total_eve_calls \
count 2666.000000 2666.000000 2666.000000 2666.000000
mean 100.310203 30.512404 200.386159 100.023631
std 19.988162 9.215733 50.951515 20.161445
min 0.000000 0.000000 0.000000 0.000000
25% 87.000000 24.380000 165.300000 87.000000
50% 101.000000 30.590000 200.900000 100.000000
75% 114.000000 36.700000 235.100000 114.000000
max 160.000000 59.640000 363.700000 170.000000
total_eve_charge total_night_minutes total_night_calls \
count 2666.000000 2666.000000 2666.000000
mean 17.033072 201.168942 100.106152
std 4.330864 50.780323 19.418459
min 0.000000 43.700000 33.000000
25% 14.050000 166.925000 87.000000
50% 17.080000 201.150000 100.000000
75% 19.980000 236.475000 113.000000
max 30.910000 395.000000 166.000000
total_night_charge total_intl_minutes total_intl_calls \
count 2666.000000 2666.000000 2666.000000
mean 9.052689 10.237022 4.467367
std 2.285120 2.788349 2.456195
min 1.970000 0.000000 0.000000
25% 7.512500 8.500000 3.000000
50% 9.050000 10.200000 4.000000
75% 10.640000 12.100000 6.000000
max 17.770000 20.000000 20.000000
total_intl_charge customer_service_calls
count 2666.000000 2666.000000
mean 2.764490 1.562641
std 0.752812 1.311236
min 0.000000 0.000000
25% 2.300000 1.000000
50% 2.750000 1.000000
75% 3.270000 2.000000
max 5.400000 9.000000
The degree of skewness for each numerical feature: churn: 2.01 total_intl_calls: 1.36 number_vmail_messages: 1.27 area_code: 1.11 customer_service_calls: 1.10 account_length: 0.08 total_night_minutes: 0.02 total_night_charge: 0.02 total_night_calls: 0.01 total_eve_charge: -0.01 total_eve_minutes: -0.01 total_day_charge: -0.05 total_day_minutes: -0.05 total_eve_calls: -0.07 total_day_calls: -0.13 total_intl_minutes: -0.22 total_intl_charge: -0.22
There are 134 number of outliers
state account_length area_code international_plan voice_mail_plan \
8 49 141 415 1 1
27 18 172 408 0 0
35 20 135 408 1 1
83 12 98 510 0 1
93 21 36 510 1 1
... ... ... ... ... ...
2581 34 150 415 0 1
2587 8 75 510 0 1
2597 27 77 408 1 1
2598 36 146 510 0 0
2631 22 119 510 1 1
number_vmail_messages total_day_minutes total_day_calls \
8 37 258.6 84
27 0 212.0 121
35 41 173.1 85
83 21 161.2 114
93 42 196.8 89
... ... ... ...
2581 35 139.6 72
2587 28 200.6 96
2597 44 103.2 117
2598 0 138.4 104
2631 22 172.1 119
total_day_charge total_eve_minutes ... total_eve_charge \
8 43.96 222.0 ... 18.87
27 36.04 31.2 ... 2.65
35 29.43 203.9 ... 17.33
83 27.40 252.2 ... 21.44
93 33.46 254.9 ... 21.67
... ... ... ... ...
2581 23.73 332.8 ... 28.29
2587 34.10 164.1 ... 13.95
2597 17.54 236.3 ... 20.09
2598 23.53 158.9 ... 13.51
2631 29.26 223.6 ... 19.01
total_night_minutes total_night_calls total_night_charge \
8 326.4 97 14.69
27 293.3 78 13.20
35 122.2 78 5.50
83 160.2 92 7.21
93 138.3 126 6.22
... ... ... ...
2581 213.8 105 9.62
2587 169.6 153 7.63
2597 203.5 101 9.16
2598 47.4 73 2.13
2631 150.0 94 6.75
total_intl_minutes total_intl_calls total_intl_charge \
8 11.2 5 3.02
27 12.6 10 3.40
35 14.6 15 3.94
83 4.4 8 1.19
93 20.0 6 5.40
... ... ... ...
2581 8.8 2 2.38
2587 2.5 5 0.68
2597 11.9 2 3.21
2598 3.9 9 1.05
2631 13.9 20 3.75
customer_service_calls churn outliers
8 0 False -1
27 3 False -1
35 0 True -1
83 4 False -1
93 0 True -1
... ... ... ...
2581 2 False -1
2587 1 False -1
2597 0 True -1
2598 4 True -1
2631 1 True -1
[134 rows x 21 columns]
First, I will specify which features are categorical and which are numerical, and whether any of them need to be converted.
# Keep a copy of the frame as it is before any dtype conversion.
df_copy = df.copy()

# Columns that describe a category rather than a quantity.
category_fetures = ['area_code', 'state', 'international_plan', 'voice_mail_plan']
for column in category_fetures:
    df[column] = df[column].astype('category')

# Every remaining column except the target is a numerical feature.
numerical_features = [
    feature
    for feature in df.columns
    if not (feature in category_fetures or feature == 'churn')
]
numerical_features
['account_length', 'number_vmail_messages', 'total_day_minutes', 'total_day_calls', 'total_day_charge', 'total_eve_minutes', 'total_eve_calls', 'total_eve_charge', 'total_night_minutes', 'total_night_calls', 'total_night_charge', 'total_intl_minutes', 'total_intl_calls', 'total_intl_charge', 'customer_service_calls']
I have converted the four features in the list above to the pandas `category` dtype (three of them were previously stored as objects and `area_code` as an integer) because this reduces memory usage. This can make later processing faster and ensures these columns are treated as categories rather than as numbers or free text.
I just plotted out the numerical data for checking the distributions
- Most of them are normally distributed which is a good sign
- There are a few features that don't follow the normal distributions namely:
- number_vmail_messages: The number of voice mail messages the customer has.
- customer_service_calls: The number of calls the customer made to customer service.
I believe that these features are inherently non-normally distributed. Moreover, the dataset shows that customers don't use voice mail much, and there are some outliers in this feature.
Secondly, the number of customer service calls is most frequently one; more than three or four calls is uncommon, which is understandable.
I just checked the skewness of the numerical features. The results are quite predictable given their histogram plots.
- Three most notable right skewed (positive values) features are:
- total_intl_calls: Total number of international calls made by the customer
- number_vmail_messages: The number of voice mail messages the customer has
- customer_service_calls: The number of calls the customer made to customer service
- There are four nearly normally distributed but slightly right-skewed features (with small positive values):
- account_length: The number of days the customer has had an account with the company
- total_night_minutes: Total minutes of calls made by the customer during the night
- total_night_charge: Total charges incurred by the customer for calls made during the night
- total_night_calls: Total number of calls made by the customer during the night
I believe that these features are not highly skewed, so they may not need transformation.
- There are eight left-skewed distributions, but none severely so; therefore, it is not necessary to transform these features.
As expected, the churn rate is not balanced, which may need to be dealt with later in the process.
- Churn analysis by state
- Group the data by the state column and calculate the churn rate for each state to see which states have the highest and lowest churn rates and identify patterns
- International plan impact
- Group the data by the international plan column and calculate aggregate statistics for numerical features to determine if there's a significant difference in usage patterns between customers with and without international plans
- Voice mail plan analysis
- Apply the same analysis as for the international plan
# The list of categorical columns is defined earlier in this file under the
# (misspelled) name `category_fetures`; `category_features` is not defined
# anywhere in the visible source and raised a NameError here.
category_fetures
le = LabelEncoder()
# Label-encode the categorical columns of the copy; `df` itself is not encoded here.
for feature in category_fetures:
    df_copy[feature] = le.fit_transform(df_copy[feature])
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) call only produced an extra empty figure.
# NOTE(review): the plot uses `df`, not the freshly encoded `df_copy`; confirm this is intended.
sns.pairplot(df, hue='churn')
plt.show()
<seaborn.axisgrid.PairGrid at 0x2c6cf7c2310>
<Figure size 1500x1200 with 0 Axes>